/*
* Copyright (C) 2010 eXo Platform SAS.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.xcmis.search.lucene.index;
import org.apache.commons.lang.NotImplementedException;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xcmis.search.config.IndexConfiguration;
import org.xcmis.search.content.ContentEntry;
import org.xcmis.search.content.ContentIndexer;
import org.xcmis.search.content.Property;
import org.xcmis.search.content.Property.BinaryValue;
import org.xcmis.search.content.Property.ContentValue;
import org.xcmis.spi.utils.Logger;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.Collection;
/**
* Create {@link Document} from {@link ContentEntry}
*/
public class LuceneIndexer implements ContentIndexer<Document>
{
private final IndexConfiguration indexConfiguration;
private final AutoDetectParser parser;
/**
* Class logger.
*/
private static final Logger LOG = Logger.getLogger(LuceneIndexer.class);
/**
* @param extractor
*/
public LuceneIndexer(IndexConfiguration indexConfiguration)
{
super();
this.parser = new AutoDetectParser(indexConfiguration.getTikaConfiguration());
this.indexConfiguration = indexConfiguration;
}
/**
*
* @see org.xcmis.search.content.ContentIndexer#createDocument(org.xcmis.search.content.ContentEntry)
*/
public Document createDocument(ContentEntry contentEntry)
{
final Document doc = new Document();
// UUID
doc.add(new Field(FieldNames.UUID, contentEntry.getIdentifier(), Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
//root
if (contentEntry.getParentIdentifiers().length == 0)
{
doc.add(new Field(FieldNames.PARENT, indexConfiguration.getRootParentUuid(), Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
doc.add(new Field(FieldNames.LABEL, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS,
Field.TermVector.NO));
}
else
{
//parent uuids
for (int i = 0; i < contentEntry.getParentIdentifiers().length; i++)
{
String parentIdetifier = contentEntry.getParentIdentifiers()[i];
doc.add(new Field(FieldNames.PARENT, parentIdetifier, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS,
Field.TermVector.NO));
doc.add(new Field(FieldNames.LABEL, contentEntry.getName(), Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
}
}
//table names
for (int i = 0; i < contentEntry.getTableNames().length; i++)
{
doc.add(new Field(FieldNames.TABLE_NAME, contentEntry.getTableNames()[i], Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
}
for (int i = 0; i < contentEntry.getProperties().length; i++)
{
Property property = contentEntry.getProperties()[i];
if (isIndexed(property.getName()))
{
addProperty(doc, property);
}
}
return doc;
}
/**
* Extract content of binary value.
*
* @param doc
* @param propName
* @param data
*/
private void addBinaryProperty(final Document doc, String propName, BinaryValue data)
{
if (data.getMimeType() != null)
{
if (parser != null)
{
final InputStream is = data.getValue();
try
{
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, data.getMimeType());
if (data.getEncoding() != null)
{
metadata.set(Metadata.CONTENT_ENCODING, data.getEncoding());
}
BodyContentHandler handler = new BodyContentHandler();
parser.parse(is, handler, metadata);
final Field f =
new Field(FieldNames.createFullTextFieldName(propName), handler.toString(), Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.NO);
doc.add(f);
}
catch (IOException e)
{
// no data - no index
if (LOG.isDebugEnabled())
{
LOG.warn("Binary value indexer IO error " + e, e);
}
}
catch (SAXException e)
{
// no data - no index
if (LOG.isDebugEnabled())
{
LOG.warn("Binary value indexer IO error " + e, e);
}
}
catch (TikaException e)
{ // no data - no index
if (LOG.isDebugEnabled())
{
LOG.warn("Binary value indexer IO error " + e, e);
}
}
finally
{
if (is != null)
{
try
{
is.close();
}
catch (IOException e)
{
if (LOG.isDebugEnabled())
{
LOG.warn("Binary value indexer IO error " + e, e);
}
}
}
}
}
}
}
/**
* Adds the string representation of the boolean value to the document as the
* named field.
*
* @param doc
* The document to which to add the field
* @param fieldName
* The name of the field to add
* @param internalValue
* The value for the field to add to the document.
*/
private void addBooleanValue(final Document doc, final String fieldName, final Boolean internalValue)
{
doc.add(createFieldWithoutNorms(fieldName, internalValue.toString(), false));
}
/**
* Adds the calendar value to the document as the named field. The calendar
* value is converted to an indexable string value using the
* {@link DateTools} class.
*
* @param doc
* The document to which to add the field
* @param fieldName
* The name of the field to add
* @param value
* The value for the field to add to the document.
*/
private void addCalendarValue(final Document doc, final String fieldName, final Calendar value)
{
doc.add(createFieldWithoutNorms(fieldName, DateTools.dateToString(value.getTime(),
DateTools.Resolution.MILLISECOND), false));
}
/**
* Adds the double value to the document as the named field. The double value
* is converted to an indexable string value using the {@link DoubleField}
* class.
*
* @param doc
* The document to which to add the field
* @param fieldName
* The name of the field to add
* @param internalValue
* The value for the field to add to the document.
*/
private void addDoubleValue(final Document doc, final String fieldName, final Double doubleValue)
{
doc.add(createFieldWithoutNorms(fieldName, ExtendedNumberTools.doubleToString(doubleValue), false));
}
/**
* Adds the length field.
*
* @param doc
* @param propName
* - property name.
* @param value
*/
private void addLengthField(Document doc, String propName, ContentValue value)
{
doc.add(new Field(FieldNames.createFieldLengthName(propName), //
NumberTools.longToString(value.getLength()), //
Store.YES, //
Index.NOT_ANALYZED_NO_NORMS));
}
/**
* Adds the long value to the document as the named field. The long value is
* converted to an indexable string value using the {@link NumberTools}
* class.
*
* @param doc
* The document to which to add the field
* @param fieldName
* The name of the field to add
* @param longValue
* The value for the field to add to the document.
*/
private void addLongValue(final Document doc, final String fieldName, final Long longValue)
{
doc.add(createFieldWithoutNorms(fieldName, NumberTools.longToString(longValue), false));
}
/**
* Adds a {@link FieldNames#MVP} field to <code>doc</code> with the resolved
* <code>name</code> using the internal search index namespace mapping.
*
* @param doc
* the lucene document.
* @param propName
* the name of the multi-value property.
* @throws RepositoryException
* if any repository errors
*/
private void addMVPName(final Document doc, final String propName)
{
doc.add(new Field(FieldNames.MVP, propName, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
}
/**
* Adds the non binary property.
*
* @param doc
* the doc
* @param propertyData
* the property data
* @throws RepositoryException
* the repository exception
*/
@SuppressWarnings("unchecked")
private void addProperty(final Document doc, final Property propertyData)
{
final String propName = propertyData.getName();
addPropertyName(doc, propName);
Collection<ContentValue> data = propertyData.getValue();
for (ContentValue value : data)
{
switch (propertyData.getType())
{
case BINARY :
addBinaryProperty(doc, propName, ((BinaryValue)value));
break;
case BOOLEAN :
//property marked as boolean so it should be possible to convert it to boolean
addBooleanValue(doc, propName, Boolean.parseBoolean(value.getValue().toString()));
break;
case NAME :
case PATH :
case STRING :
//property marked as string so it should be possible to convert it to string
this.addStringValue(doc, propName, value.getValue().toString(), true);
break;
case LONG :
//property marked as long so it should be possible to convert it to long
addLongValue(doc, propName, Long.parseLong(value.getValue().toString()));
break;
case DOUBLE :
//property marked as long so it should be possible to convert it to double
addDoubleValue(doc, propName, Double.parseDouble(value.getValue().toString()));
break;
case DATE :
//value should be calendar
addCalendarValue(doc, propName, (Calendar)value.getValue());
break;
default :
throw new NotImplementedException();
}
addLengthField(doc, propName, value);
}
if (data.size() > 1)
{
// real multi-valued
addMVPName(doc, propName);
}
}
/**
* Adds the property name to the lucene _:PROPERTIES_SET field.
*
* @param doc
* the document.
* @param name
* the name of the property.
* @throws RepositoryException
* if any repository errors
*/
private void addPropertyName(final Document doc, final String propertyName)
{
doc.add(new Field(FieldNames.PROPERTIES_SET, propertyName, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
}
/**
* Adds the string value to the document both as the named field and
* optionally for full text indexing if <code>tokenized</code> is
* <code>true</code>.
*
* @param doc
* The document to which to add the field
* @param fieldName
* The name of the field to add
* @param internalValue
* The value for the field to add to the document.
* @param tokenized
* If <code>true</code> the string is also tokenized and fulltext
* indexed.
*/
private void addStringValue(final Document doc, final String fieldName, final String stringValue,
final boolean tokenized)
{
// simple String
doc.add(createFieldWithoutNorms(fieldName, stringValue, false));
if (tokenized)
{
if (stringValue.length() != 0)
{
// create fulltext index on property
doc.add(new Field(FieldNames.createFullTextFieldName(fieldName), stringValue, Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.NO));
}
}
}
/**
* Creates a document field name as prefixed <code>fieldName</code> with the
* value of <code>
* internalValue</code> . The created field is indexed without
* norms.
*
* @param fieldName
* The name of the field to add
* @param internalValue
* The value for the field to add to the document.
* @param store
* <code>true</code> if the value should be stored,
* <code>false</code> otherwise
* @return field Field
*/
private Field createFieldWithoutNorms(final String fieldName, final String internalValue, final boolean store)
{
final Field field =
new Field(FieldNames.createPropertyFieldName(fieldName), internalValue, store ? Field.Store.YES
: Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO);
return field;
}
/**
* Returns <code>true</code> if the property with the given name should be
* indexed.
*
* @param propertyName
* name of a property.
* @return <code>true</code> if the property should be fulltext indexed;
* <code>false</code> otherwise.
*/
private boolean isIndexed(final String propertyName)
{
return true;
}
}